In [1]:
import pandas as pd
In [3]:
# Load the city table; the file uses ';' as the field separator.
df = pd.read_csv('city.csv', sep=';')
# A bare expression on the last line of a cell displays the (truncated) frame.
df
Out[3]:
ID Name CountryCode District Population
0 1 Kabul AFG Kabol 1780000
1 2 Qandahar AFG Qandahar 237500
2 3 Herat AFG Herat 186800
3 4 Mazar-e-Sharif AFG Balkh 127800
4 5 Amsterdam NLD Noord-Holland 731200
... ... ... ... ... ...
4074 4075 Khan Yunis PSE Khan Yunis 123175
4075 4076 Hebron PSE Hebron 119401
4076 4077 Jabaliya PSE North Gaza 113901
4077 4078 Nablus PSE Nablus 100231
4078 4079 Rafah PSE Rafah 92020

4079 rows × 5 columns

In [8]:
# Preview the first 10 rows.
df.head(10)
Out[8]:
ID Name CountryCode District Population
0 1 Kabul AFG Kabol 1780000
1 2 Qandahar AFG Qandahar 237500
2 3 Herat AFG Herat 186800
3 4 Mazar-e-Sharif AFG Balkh 127800
4 5 Amsterdam NLD Noord-Holland 731200
5 6 Rotterdam NLD Zuid-Holland 593321
6 7 Haag NLD Zuid-Holland 440900
7 8 Utrecht NLD Utrecht 234323
8 9 Eindhoven NLD Noord-Brabant 201843
9 10 Tilburg NLD Noord-Brabant 193238
In [9]:
# Preview the last 10 rows.
df.tail(10)
Out[9]:
ID Name CountryCode District Population
4069 4070 Chitungwiza ZWE Harare 274912
4070 4071 Mount Darwin ZWE Harare 164362
4071 4072 Mutare ZWE Manicaland 131367
4072 4073 Gweru ZWE Midlands 128037
4073 4074 Gaza PSE Gaza 353632
4074 4075 Khan Yunis PSE Khan Yunis 123175
4075 4076 Hebron PSE Hebron 119401
4076 4077 Jabaliya PSE North Gaza 113901
4077 4078 Nablus PSE Nablus 100231
4078 4079 Rafah PSE Rafah 92020
In [10]:
# Number of rows in the frame.
len(df)
Out[10]:
4079
In [11]:
# Dimensions as a (rows, columns) tuple.
df.shape
Out[11]:
(4079, 5)
In [13]:
# Per-column dtype and non-null count; a non-null count below the number of
# entries means the column has missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4079 entries, 0 to 4078
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           4079 non-null   int64 
 1   Name         4079 non-null   object
 2   CountryCode  4079 non-null   object
 3   District     4075 non-null   object
 4   Population   4079 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 159.5+ KB
In [15]:
# Rows where District is missing.
df.loc[df['District'].isna()]
Out[15]:
ID Name CountryCode District Population
3284 3285 Taiping TWN NaN 165524
3292 3293 Taliao TWN NaN 115897
3293 3294 Kueishan TWN NaN 112195
3562 3563 Ciudad Losada VEN NaN 134501
In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[16]:
ID Population
count 4079.000000 4.079000e+03
mean 2040.000000 3.504682e+05
std 1177.650203 7.237757e+05
min 1.000000 4.200000e+01
25% 1020.500000 1.147890e+05
50% 2040.000000 1.670510e+05
75% 3059.500000 3.106385e+05
max 4079.000000 1.050000e+07
In [17]:
# Distinct country codes, in order of first appearance.
df['CountryCode'].unique()
Out[17]:
array(['AFG', 'NLD', 'ANT', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA',
       'ATG', 'ARE', 'ARG', 'ARM', 'ABW', 'AUS', 'AZE', 'BHS', 'BHR',
       'BGD', 'BRB', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BIH',
       'BWA', 'BRA', 'GBR', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'CYM',
       'CHL', 'COK', 'CRI', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV',
       'ERI', 'ESP', 'ZAF', 'ETH', 'FLK', 'FJI', 'PHL', 'FRO', 'GAB',
       'GMB', 'GEO', 'GHA', 'GIB', 'GRD', 'GRL', 'GLP', 'GUM', 'GTM',
       'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'SJM', 'IDN', 'IND',
       'IRQ', 'IRN', 'IRL', 'ISL', 'ISR', 'ITA', 'TMP', 'AUT', 'JAM',
       'JPN', 'YEM', 'JOR', 'CXR', 'YUG', 'KHM', 'CMR', 'CAN', 'CPV',
       'KAZ', 'KEN', 'CAF', 'CHN', 'KGZ', 'KIR', 'COL', 'COM', 'COG',
       'COD', 'CCK', 'PRK', 'KOR', 'GRC', 'HRV', 'CUB', 'KWT', 'CYP',
       'LAO', 'LVA', 'LSO', 'LBN', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX',
       'ESH', 'MAC', 'MDG', 'MKD', 'MWI', 'MDV', 'MYS', 'MLI', 'MLT',
       'MAR', 'MHL', 'MTQ', 'MRT', 'MUS', 'MYT', 'MEX', 'FSM', 'MDA',
       'MCO', 'MNG', 'MSR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NIC',
       'NER', 'NGA', 'NIU', 'NFK', 'NOR', 'CIV', 'OMN', 'PAK', 'PLW',
       'PAN', 'PNG', 'PRY', 'PER', 'PCN', 'MNP', 'PRT', 'PRI', 'POL',
       'GNQ', 'QAT', 'FRA', 'GUF', 'PYF', 'REU', 'ROM', 'RWA', 'SWE',
       'SHN', 'KNA', 'LCA', 'VCT', 'SPM', 'DEU', 'SLB', 'ZMB', 'WSM',
       'SMR', 'STP', 'SAU', 'SEN', 'SYC', 'SLE', 'SGP', 'SVK', 'SVN',
       'SOM', 'LKA', 'SDN', 'FIN', 'SUR', 'SWZ', 'CHE', 'SYR', 'TJK',
       'TWN', 'TZA', 'DNK', 'THA', 'TGO', 'TKL', 'TON', 'TTO', 'TCD',
       'CZE', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'HUN',
       'URY', 'NCL', 'NZL', 'UZB', 'BLR', 'WLF', 'VUT', 'VAT', 'VEN',
       'RUS', 'VNM', 'EST', 'USA', 'VIR', 'ZWE', 'PSE'], dtype=object)
In [19]:
# Number of distinct country codes. nunique() excludes NaN by default.
df['CountryCode'].nunique()
Out[19]:
232
In [20]:
# Whole frame as a NumPy array; mixing int and string columns yields an
# object-dtype array. to_numpy() is the recommended replacement for .values.
df.to_numpy()
Out[20]:
array([[1, 'Kabul', 'AFG', 'Kabol', 1780000],
       [2, 'Qandahar', 'AFG', 'Qandahar', 237500],
       [3, 'Herat', 'AFG', 'Herat', 186800],
       ...,
       [4077, 'Jabaliya', 'PSE', 'North Gaza', 113901],
       [4078, 'Nablus', 'PSE', 'Nablus', 100231],
       [4079, 'Rafah', 'PSE', 'Rafah', 92020]], dtype=object)
In [21]:
# Row index of the frame.
df.index
Out[21]:
RangeIndex(start=0, stop=4079, step=1)
In [22]:
# Column labels of the frame.
df.columns
Out[22]:
Index(['ID', 'Name', 'CountryCode', 'District', 'Population'], dtype='object')
In [23]:
# Count rows per country code; the result is sorted by count, largest first.
df['CountryCode'].value_counts()
Out[23]:
CHN    363
IND    341
USA    274
BRA    250
JPN    248
      ... 
MHL      1
SYC      1
LUX      1
LBR      1
LSO      1
Name: CountryCode, Length: 232, dtype: int64
In [24]:
# Same per-country tally, expressed as a fraction of all rows instead of a raw count.
df['CountryCode'].value_counts(normalize=True)
Out[24]:
CHN    0.088992
IND    0.083599
USA    0.067173
BRA    0.061290
JPN    0.060799
         ...   
MHL    0.000245
SYC    0.000245
LUX    0.000245
LBR    0.000245
LSO    0.000245
Name: CountryCode, Length: 232, dtype: float64
In [ ]: